Data Loading

In part I, we tested four models: kNN, LDA, QDA, and Logistic Regression. We use the dataset from part I as the training dataset for part II of the project. We compare the following models: kNN, LDA, QDA, Logistic Regression, Random Forest, and SVM.

Preliminary Steps

We attach the packages required for the analysis.

library(tidyverse)  # data manipulation
library(ggplot2)    # plots
library(caret)      # model training
library(ROCR)       # ROC tools
library(viridis)    # ggplot coloring
library(e1071)      # SVM
library(kernlab)    # SVM

Training Data

# Load the Part I pixel data; it serves as the training ("tuning") set for Part II
tuning <- read.csv("HaitiPixels.csv")
names(tuning) <- tolower(names(tuning))   # normalize column names to lowercase

# Collapse the response to binary: "Blue Tarp" pixels vs everything else
tuning$class <- factor(ifelse(tuning$class == "Blue Tarp", "tarp", "other"))

# Ensure the three color channels are numeric
for (ch in c("red", "green", "blue")) {
  tuning[[ch]] <- as.numeric(tuning[[ch]])
}

# Quick look at class balance and channel ranges
summary(tuning)
##    class            red          green            blue      
##  other:61219   Min.   : 48   Min.   : 48.0   Min.   : 44.0  
##  tarp : 2022   1st Qu.: 80   1st Qu.: 78.0   1st Qu.: 63.0  
##                Median :163   Median :148.0   Median :123.0  
##                Mean   :163   Mean   :153.7   Mean   :125.1  
##                3rd Qu.:255   3rd Qu.:226.0   3rd Qu.:181.0  
##                Max.   :255   Max.   :255.0   Max.   :255.0
# Boxplots of each RGB channel, split by surface type
tuning %>%
  pivot_longer(!class, names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = class, y = value)) +
  geom_boxplot(aes(col = "white", fill = variable)) +
  scale_fill_manual(
    values = c(red = "#FF4E3F", green = "#50E23F", blue = "#504EB5"),
    guide = "none"
  ) +
  scale_color_manual(values = "white", guide = "none") +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  facet_wrap(vars(variable)) +
  labs(title = "Training Data")

# Per-channel value densities with a slim inset boxplot, one facet per channel
tuning %>%
  pivot_longer(!class, names_to = "variable", values_to = "value") %>%
  ggplot() +
  geom_density(aes(x = value, fill = variable), col = NA, alpha = 0.5) +
  scale_fill_manual(
    values = c(red = "#FF4E3F", green = "#50E23F", blue = "#504EB5"),
    guide = "none"
  ) +
  xlim(0, 255) +
  ylim(0, 0.025) +
  geom_boxplot(aes(x = value, y = 0.005),
               lwd = 1, fatten = 1, fill = NA, width = 0.005) +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title.y = element_blank()
  ) +
  facet_wrap(vars(variable), nrow = 3) +
  labs(title = "Training Data")

# Violin plots of each channel by surface type, with an overlaid boxplot
tuning %>%
  pivot_longer(!class, names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = value, y = class)) +
  geom_violin(aes(fill = variable), col = "white") +
  scale_fill_manual(
    values = c(red = "#9C2C34", green = "#38c08c", blue = "#1E5B79"),
    guide = "none"
  ) +
  geom_boxplot(col = "white", alpha = 0.01, width = 0.05, fatten = 1) +
  # stat_summary(fun.data = data_summary)  # disabled in the original
  xlim(0, 255) +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title.y = element_blank(),
    axis.title.x = element_blank()
  ) +
  facet_wrap(vars(variable)) +
  labs(title = "Training Data")

# Create interactive plot
# 3-D scatter of the training pixels in RGB space, each point drawn in its
# own color. rgl.useNULL = TRUE renders off-screen so rglwidget() can embed
# the scene in the report instead of opening a device window.
options(rgl.useNULL=TRUE)
plotids <- rgl::plot3d(x = tuning$red,
            y = tuning$green,
            z = tuning$blue,
            col = rgb(tuning$red, tuning$green, tuning$blue, maxColorValue = 255),
            xlab = "Red",
            ylab = "Green",
            zlab = "Blue",
            xlim = c(0,255),
            ylim = c(0,255),
            zlim = c(0,255),
            main = "Training Data")
rgl::rglwidget()

Holdout Data

There are seven files to read in, identified by the numbers 057, 067, 069, and 078. For each of 067, 069, and 078 there is a pair of files — one for tarps and one for non-tarps. File 057 has only non-tarp observations.

We create a function to process the holdout data .txt files.

# write a function to process text files

processPixels <- function(fileName){
    # Read one ROI pixel-export .txt file and return a data frame with
    # numeric columns red, green, blue plus a factor column class.
    #
    # fileName: path to a single .txt export. Files whose names contain
    #           "NOT" or "NON" are labeled "other"; all others "tarp".
    #
    # Validate early: must be a single string ending in ".txt".
    # (The original check grepl(".txt", ...) treated "." as a regex
    # wildcard and matched the substring anywhere, accepting names such
    # as "my_txt_notes.md"; anchor the extension instead.)
    if(!is.character(fileName) || length(fileName) != 1){
        stop("Please provide string input.")
    } else if(!grepl("\\.txt$", fileName)){
        stop("File Name invalid.")
    }

    # The first 8 lines of these exports are header metadata; the pixel
    # values sit in the last three columns of the table.
    dt <- read.table(fileName, skip = 8, header = FALSE)
    dt <- dt[, (ncol(dt)-2):ncol(dt)]                   # keep only RGB cols
    names(dt) <- c("red", "green", "blue")              # add col names

    # Label from the file name: NOT/NON marks non-tarp pixels.
    # A scalar label recycles over all rows; fixed = TRUE avoids regex.
    if(grepl("NOT", fileName, fixed = TRUE) || grepl("NON", fileName, fixed = TRUE)){
        dt$class <- "other"
    } else{
        dt$class <- "tarp"
    }

    dt$class <- factor(dt$class)
    dt$red <- as.numeric(dt$red)
    dt$green <- as.numeric(dt$green)
    dt$blue <- as.numeric(dt$blue)

    dt
}

Now we process the text files and combine into a single holdout set. It is assumed that the text files exist in the present directory.

# Process each ROI export; file names containing NOT/NON hold non-tarp
# pixels, the remaining files hold blue-tarp pixels.
other057 <- processPixels("orthovnir057_ROI_NON_Blue_Tarps.txt")
tarp067 <- processPixels("orthovnir067_ROI_Blue_Tarps.txt")
other067 <- processPixels("orthovnir067_ROI_NOT_Blue_Tarps.txt")
tarp069 <- processPixels("orthovnir069_ROI_Blue_Tarps.txt")
other069 <- processPixels("orthovnir069_ROI_NOT_Blue_Tarps.txt")
tarp078 <- processPixels("orthovnir078_ROI_Blue_Tarps.txt")
other078 <- processPixels("orthovnir078_ROI_NON_Blue_Tarps.txt")

# Stack the pieces into a single holdout set and drop the intermediates
holdout <- rbind(other057, tarp067, other067, tarp069, other069, tarp078, other078)
rm(other057, tarp067, other067, tarp069, other069, tarp078, other078)

# row.names = FALSE: the default (TRUE) writes a leading row-number column
# that re-appears as a spurious "X" variable when holdout.csv is re-read.
write.csv(holdout, "holdout.csv", row.names = FALSE)

summary(holdout)
##       red            green            blue          class        
##  Min.   : 27.0   Min.   : 28.0   Min.   : 25.00   other:1989697  
##  1st Qu.: 76.0   1st Qu.: 71.0   1st Qu.: 55.00   tarp :  14480  
##  Median :107.0   Median : 91.0   Median : 66.00                  
##  Mean   :118.3   Mean   :105.4   Mean   : 82.36                  
##  3rd Qu.:139.0   3rd Qu.:117.0   3rd Qu.: 88.00                  
##  Max.   :255.0   Max.   :255.0   Max.   :255.00

Preview the holdout data.

# Boxplots of each RGB channel in the holdout set, split by surface type
holdout %>%
  pivot_longer(!class, names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = class, y = value)) +
  geom_boxplot(aes(col = "white", fill = variable)) +
  scale_fill_manual(
    values = c(red = "#8B4737", green = "#4C7537", blue = "#4C4758"),
    guide = "none"
  ) +
  scale_color_manual(values = "white", guide = "none") +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  facet_wrap(vars(variable)) +
  labs(title = "Holdout Data")

# Per-channel densities for the holdout set with a slim inset boxplot
holdout %>%
  pivot_longer(!class, names_to = "variable", values_to = "value") %>%
  ggplot() +
  geom_density(aes(x = value, fill = variable), col = NA, alpha = 0.5) +
  scale_fill_manual(
    values = c(red = "#8B4737", green = "#4C7537", blue = "#4C4758"),
    guide = "none"
  ) +
  xlim(0, 255) +
  ylim(0, 0.025) +
  geom_boxplot(aes(x = value, y = 0.005),
               lwd = 1, fatten = 1, fill = NA, width = 0.005) +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title.y = element_blank()
  ) +
  facet_wrap(vars(variable), nrow = 3) +
  labs(title = "Holdout Data")

# Violin plots of each channel by surface type for the holdout set
holdout %>%
  pivot_longer(!class, names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = value, y = class)) +
  geom_violin(aes(fill = variable), col = "white") +
  scale_fill_manual(
    values = c(red = "#9C2C34", green = "#38c08c", blue = "#1E5B79"),
    guide = "none"
  ) +
  geom_boxplot(col = "white", alpha = 0.01, width = 0.05, fatten = 1) +
  # stat_summary(fun.data = data_summary)  # disabled in the original
  xlim(0, 255) +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title.y = element_blank(),
    axis.title.x = element_blank()
  ) +
  facet_wrap(vars(variable)) +
  labs(title = "Holdout Data")

# Create interactive plot
# The holdout data has ~2e6 observations, so plotting all of them takes a
# long time. We randomly sample 0.5% of the holdout set (the 0.005 factor
# below) and plot only these points.

set.seed(199)
holdout.sample <- holdout[sample(nrow(holdout), round(0.005*nrow(holdout))), ]

# Off-screen rgl rendering so the widget embeds in the report
options(rgl.useNULL=TRUE)
plotids <- rgl::plot3d(x = holdout.sample$red,
            y = holdout.sample$green,
            z = holdout.sample$blue,
            col = rgb(holdout.sample$red,
                      holdout.sample$green,
                      holdout.sample$blue,
                      maxColorValue = 255),
            xlab = "Red",
            ylab = "Green",
            zlab = "Blue",
            xlim = c(0,255),
            ylim = c(0,255),
            zlim = c(0,255),
            main = "Holdout Data")
rgl::rglwidget()

Data Splitting

Outline methodology for splitting training data.

  • Determine train/test ratio
  • Use train subset to tune model parameters (if necessary)
  • Evaluate models on test subset
  • Re-fit the selected model on the entire training set
  • Evaluate models on holdout set
# Split the Part I data 50/50 into train/test, stratified on class
set.seed(199)
train <- createDataPartition(tuning$class, p = 0.5, list = FALSE)

# create new dataframes of test data to be passed to predict()
# dplyr::select is namespaced because caret's dependencies (e.g. MASS via
# lda/qda) can mask select() later in the session.
testX <- tuning[-train, ] %>% dplyr::select(-class)
testLabels <- tuning[-train, 'class']

# split holdout data similarly
holdoutX <- holdout %>% dplyr::select(-class)
holdoutLabels <- holdout$class

# Seeds for reproducible repeatedcv: one integer vector per resample
# (10 folds x 5 repeats = 50), each sized for up to 20 tuning candidates,
# plus a single seed for the final model fit (element 51).
set.seed(199)
seeds <- vector(mode = "list", length = 51)
for(i in seq_len(length(seeds) - 1)) seeds[[i]] <- sample.int(n=1000, 20)
seeds[[51]] <- sample.int(1000, 1)

# TODO: build a seed list whose per-resample vectors cover the full tuning
# grid of two-parameter models (e.g. the RBF-kernel SVM: sigma x C).
set.seed(199)

# trainControl for tuning (method "repeatedcv" defaults to number = 10 folds)
ctrl <- trainControl(method= "repeatedcv", repeats = 5, seeds = seeds)

k-Nearest Neighbors

Training

We use the training data to tune the model. We want to identify the value of \(k\) that provides the lowest cross-validation error. The train() function from the caret package is used.

# Tune k by repeated 10-fold CV; tuneLength = 15 tries 15 odd values of k
# (5, 7, ..., 33 per the printed results). Predictors are centered and
# scaled within each resample.
knn.tuning <- train(class~.,
                    data = tuning,
                    subset = train,      # fit on the training half only
                    method = "knn",
                    metric = "Accuracy", # selection criterion across k
                    trControl = ctrl,
                    preProcess = c("center","scale"),
                    tuneLength = 15)
                    #tuneGrid = expand.grid(k = seq(5,24)))

knn.tuning
## k-Nearest Neighbors 
## 
## 63241 samples
##     3 predictor
##     2 classes: 'other', 'tarp' 
## 
## Pre-processing: centered (3), scaled (3) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 28459, 28459, 28459, 28459, 28458, 28459, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.9967996  0.9485798
##    7  0.9967363  0.9473888
##    9  0.9966478  0.9459434
##   11  0.9966478  0.9458139
##   13  0.9966098  0.9451114
##   15  0.9966667  0.9460311
##   17  0.9965655  0.9443363
##   19  0.9965403  0.9439034
##   21  0.9965276  0.9436469
##   23  0.9964137  0.9418620
##   25  0.9964201  0.9418399
##   27  0.9964074  0.9415390
##   29  0.9963315  0.9402337
##   31  0.9962809  0.9393528
##   33  0.9961734  0.9375580
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.

10-fold cross-validation, repeated five times, selects k = 5.

# Evaluate the tuned kNN on the held-back test half; "tarp" is the positive class
knn.preds <- predict(knn.tuning, newdata = testX)
confusionMatrix(knn.preds, testLabels, positive = "tarp")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction other  tarp
##      other 30547    29
##      tarp     62   982
##                                           
##                Accuracy : 0.9971          
##                  95% CI : (0.9965, 0.9977)
##     No Information Rate : 0.968           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9542          
##                                           
##  Mcnemar's Test P-Value : 0.0007951       
##                                           
##             Sensitivity : 0.97132         
##             Specificity : 0.99797         
##          Pos Pred Value : 0.94061         
##          Neg Pred Value : 0.99905         
##              Prevalence : 0.03197         
##          Detection Rate : 0.03106         
##    Detection Prevalence : 0.03302         
##       Balanced Accuracy : 0.98464         
##                                           
##        'Positive' Class : tarp            
## 

The Accuracy on the test subset is 0.9971

Holdout

For sake of comparison, we make predictions on the holdout set using both models.

# Predict labels on holdout data. Create confusion matrix
# (same tuned model, now scored on the ~2M-pixel holdout set)
knn.holdout.preds <- predict(knn.tuning, newdata = holdoutX)
confusionMatrix(knn.holdout.preds, holdoutLabels, positive = "tarp")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   other    tarp
##      other 1974768    1918
##      tarp    14929   12562
##                                           
##                Accuracy : 0.9916          
##                  95% CI : (0.9915, 0.9917)
##     No Information Rate : 0.9928          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.5948          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.867541        
##             Specificity : 0.992497        
##          Pos Pred Value : 0.456950        
##          Neg Pred Value : 0.999030        
##              Prevalence : 0.007225        
##          Detection Rate : 0.006268        
##    Detection Prevalence : 0.013717        
##       Balanced Accuracy : 0.930019        
##                                           
##        'Positive' Class : tarp            
## 
# Predict Probabilities for ROC
knn.holdout.probs <- predict(knn.tuning, newdata = holdoutX, type = "prob")

# Column 2 is P(class = "tarp"): caret orders probability columns by factor
# level ("other", "tarp"), so [2] selects the positive class.
knn.rates <- prediction(knn.holdout.probs[2], holdoutLabels)
knn.ROC <- performance(knn.rates, measure = "tpr", x.measure = "fpr")

knn.auc <- performance(knn.rates, measure = "auc")
knn.auc@y.values #auc = 0.9625007
## [[1]]
## [1] 0.9625007
# Assemble ROC coordinates into a data frame, colored by probability threshold
knn.roc.df <- data.frame(
  FPR = knn.ROC@x.values[[1]],
  TPR = knn.ROC@y.values[[1]],
  Threshold = knn.ROC@alpha.values[[1]]
)
knn.roc.df$Threshold[1] <- 1  # first cutoff is Inf; cap it for the color scale

knn.roc.df %>%
  ggplot(aes(FPR, TPR)) +
  geom_line(aes(color = Threshold), lwd = 2) +
  scale_colour_viridis(option = "plasma") +
  geom_abline(lwd = 1, color = "gray") +
  theme_bw() +
  theme(
    axis.line = element_line(color = "gray"),
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank()
  ) +
  labs(title = "ROC of kNN (k = 5)") +
  annotate("text", x = 0.75, y = 0.25, label = "AUC = 0.9625")

# create grid
# 151 evenly spaced values per channel over [25, 255] -> 151^3 RGB points
boundary.grid <- expand.grid(red = seq(25,255,length.out=151),
                             green = seq(25,255,length.out=151),
                             blue = seq(25,255,length.out=151))

# predict over boundary.grid
boundary.probs <- predict(knn.tuning, boundary.grid, type = "prob")
boundary.df <- cbind(boundary.grid, boundary.probs[2])   # appends a "tarp" prob column

# 13-step fill palette: low -> high P(tarp) runs orange -> white -> blue
colPal <- c("#ff822e", "#ff9751", "#ffac74", "#ffc197", "#ffd5b9", 
            "#ffeadc","#ffffff", "#dfedf9", "#c0dbf2", "#a0c9ec",
            "#80b6e6", "#61a4df", "#4192d9")

# Flag each holdout pixel as correctly ("hit") or incorrectly ("miss") classified
new.holdout <- holdout
new.holdout$hitmiss <- factor(ifelse(knn.holdout.preds==holdout$class,'hit', 'miss'))

# Slice the probability cube at green == 117 and overlay holdout pixels with
# that exact green value.
# NOTE(review): `==` against seq()-generated grid values relies on 117 being
# exactly representable at this spacing — consider dplyr::near(); verify.
ggplot(data = NULL)+
    geom_contour_filled(data = slice(boundary.df,which(boundary.df$green==117)),
                        breaks = seq(0,1.0001,length.out=14),
                        aes(x=red, y=blue, z=tarp), bins = 13)+
  scale_fill_manual(values = colPal,guide = "none")+
    theme(axis.line = element_line(color='gray'),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank())+
    geom_jitter(data = new.holdout[which(new.holdout$green==117), ],
                size = 2, stroke = 1.3,
                aes(x=red, y=blue, color=class, shape = hitmiss))+
  scale_color_manual(values=c("#964B00", "#153250"))+
  scale_shape_manual(values=c(20,4))+
  ggtitle("kNN Decision Boundary [Blue against Red]")

# Same slice-and-overlay, now fixing red == 140 (green vs blue view)
ggplot(data = NULL)+
    geom_contour_filled(data = slice(boundary.df,which(boundary.df$red==140)),
                        breaks = seq(0,1.0001,length.out=14),
                        aes(x=green, y=blue, z=tarp), bins = 13)+
  scale_fill_manual(values = colPal,guide = "none")+
    theme(axis.line = element_line(color='gray'),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank())+
    geom_jitter(data = new.holdout[which(new.holdout$red==140), ],
                size = 2, stroke = 1.3,
                aes(x=green, y=blue, color=class, shape = hitmiss))+
  scale_color_manual(values=c("#964B00", "#153250"))+
  scale_shape_manual(values=c(20,4))+
  ggtitle("kNN Decision Boundary [Blue against Green]")

Linear Discriminant Analysis

Training

Again, we use train() to fit the LDA model. There are no parameters to tune.

# Repeated 10-fold CV (number = 10 matches the repeatedcv default used by
# `ctrl`). LDA has no tuning parameters; train() just reports CV accuracy.
lda.ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 5, seeds = seeds)

lda.tuning <- train(class~.,
                    data = tuning,
                    subset = train,
                    method = "lda",
                    metric = "Accuracy",
                    trControl = lda.ctrl)
lda.tuning
## Linear Discriminant Analysis 
## 
## 63241 samples
##     3 predictor
##     2 classes: 'other', 'tarp' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 28459, 28459, 28459, 28459, 28459, 28459, ... 
## Resampling results:
## 
##   Accuracy  Kappa    
##   0.98353   0.7459956

We make predictions on the test subset.

# Evaluate LDA on the held-back test half; "tarp" is the positive class
lda.preds <- predict(lda.tuning, newdata = testX)
confusionMatrix(lda.preds, testLabels, positive = "tarp")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction other  tarp
##      other 30307   191
##      tarp    302   820
##                                          
##                Accuracy : 0.9844         
##                  95% CI : (0.983, 0.9857)
##     No Information Rate : 0.968          
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.7608         
##                                          
##  Mcnemar's Test P-Value : 7.265e-07      
##                                          
##             Sensitivity : 0.81108        
##             Specificity : 0.99013        
##          Pos Pred Value : 0.73084        
##          Neg Pred Value : 0.99374        
##              Prevalence : 0.03197        
##          Detection Rate : 0.02593        
##    Detection Prevalence : 0.03548        
##       Balanced Accuracy : 0.90061        
##                                          
##        'Positive' Class : tarp           
## 

Holdout

# Score LDA on the holdout set and summarize via confusion matrix
lda.holdout.preds <- predict(lda.tuning, newdata = holdoutX)
confusionMatrix(lda.holdout.preds, holdoutLabels, positive = "tarp")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   other    tarp
##      other 1955460    2326
##      tarp    34237   12154
##                                           
##                Accuracy : 0.9818          
##                  95% CI : (0.9816, 0.9819)
##     No Information Rate : 0.9928          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3926          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.839365        
##             Specificity : 0.982793        
##          Pos Pred Value : 0.261990        
##          Neg Pred Value : 0.998812        
##              Prevalence : 0.007225        
##          Detection Rate : 0.006064        
##    Detection Prevalence : 0.023147        
##       Balanced Accuracy : 0.911079        
##                                           
##        'Positive' Class : tarp            
## 
# Predict Probabilities for ROC
lda.holdout.probs <- predict(lda.tuning, newdata = holdoutX, type = "prob")

# Column 2 is P(class = "tarp"), the positive class
lda.rates <- prediction(lda.holdout.probs[2], holdoutLabels)
lda.ROC <- performance(lda.rates, measure = "tpr", x.measure = "fpr")

lda.auc <- performance(lda.rates, measure = "auc")
lda.auc@y.values #auc = 0.9921876
## [[1]]
## [1] 0.9921876
# Assemble ROC coordinates into a data frame, colored by probability threshold
lda.roc.df <- data.frame(
  FPR = lda.ROC@x.values[[1]],
  TPR = lda.ROC@y.values[[1]],
  Threshold = lda.ROC@alpha.values[[1]]
)
lda.roc.df$Threshold[1] <- 1  # first cutoff is Inf; cap it for the color scale

lda.roc.df %>%
  ggplot(aes(FPR, TPR)) +
  geom_line(aes(color = Threshold), lwd = 2) +
  scale_colour_viridis(option = "plasma") +
  geom_abline(lwd = 1, color = "gray") +
  theme_bw() +
  theme(
    axis.line = element_line(color = "gray"),
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank()
  ) +
  labs(title = "ROC of LDA") +
  annotate("text", x = 0.75, y = 0.25, label = "AUC = 0.9922")

# predict over boundary.grid.
# Re-use the RGB grid; overwrite boundary.df with LDA's P(tarp) surface
boundary.probs <- predict(lda.tuning, boundary.grid, type = "prob")
boundary.df <- cbind(boundary.grid, boundary.probs[2])

# hit/miss flags for the LDA holdout predictions
new.holdout <- holdout
new.holdout$hitmiss <- factor(ifelse(lda.holdout.preds==holdout$class,'hit', 'miss'))

# Boundary slice at green == 117 with holdout overlay
ggplot(data = NULL)+
    geom_contour_filled(data = slice(boundary.df,which(boundary.df$green==117)),
                        breaks = seq(0,1.0001,length.out=14),
                        aes(x=red, y=blue, z=tarp), bins = 13)+
  scale_fill_manual(values = colPal,guide = "none")+
    theme(axis.line = element_line(color='gray'),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank())+
    geom_jitter(data = new.holdout[which(new.holdout$green==117), ],
                size = 2, stroke = 1.3,
                aes(x=red, y=blue, color=class, shape = hitmiss))+
  scale_color_manual(values=c("#964B00", "#153250"))+
  scale_shape_manual(values=c(20,4))+
  ggtitle("LDA Decision Boundary [Blue against Red]")

# Same slice-and-overlay for LDA, fixing red == 140 (green vs blue view)
ggplot(data = NULL)+
    geom_contour_filled(data = slice(boundary.df,which(boundary.df$red==140)),
                        breaks = seq(0,1.0001,length.out=14),
                        aes(x=green, y=blue, z=tarp), bins = 13)+
  scale_fill_manual(values = colPal,guide = "none")+
    theme(axis.line = element_line(color='gray'),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank())+
    geom_jitter(data = new.holdout[which(new.holdout$red==140), ],
                size = 2, stroke = 1.3,
                aes(x=green, y=blue, color=class, shape = hitmiss))+
  scale_color_manual(values=c("#964B00", "#153250"))+
  scale_shape_manual(values=c(20,4))+
  ggtitle("LDA Decision Boundary [Blue against Green]")

Quadratic Discriminant Analysis

Training

We fit a QDA model with train().

# Repeated 10-fold CV; QDA has no tuning parameters, so train() only
# estimates cross-validated accuracy.
qda.ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 5, seeds = seeds)

qda.tuning <- train(class~.,
                    data = tuning,
                    subset = train,
                    method = "qda",
                    metric = "Accuracy",
                    trControl = qda.ctrl)

qda.tuning
## Quadratic Discriminant Analysis 
## 
## 63241 samples
##     3 predictor
##     2 classes: 'other', 'tarp' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 28458, 28459, 28459, 28459, 28459, 28459, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.9941811  0.8974676

Make predictions on the test subset.

# Evaluate QDA on the held-back test half; "tarp" is the positive class
qda.preds <- predict(qda.tuning, newdata = testX)
confusionMatrix(qda.preds, testLabels, positive = 'tarp')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction other  tarp
##      other 30596   145
##      tarp     13   866
##                                           
##                Accuracy : 0.995           
##                  95% CI : (0.9942, 0.9958)
##     No Information Rate : 0.968           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9138          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.85658         
##             Specificity : 0.99958         
##          Pos Pred Value : 0.98521         
##          Neg Pred Value : 0.99528         
##              Prevalence : 0.03197         
##          Detection Rate : 0.02739         
##    Detection Prevalence : 0.02780         
##       Balanced Accuracy : 0.92808         
##                                           
##        'Positive' Class : tarp            
## 

Holdout

Use the QDA model to make predictions on holdout.

# Score QDA on the holdout set and summarize via confusion matrix
qda.holdout.preds <- predict(qda.tuning, newdata = holdoutX)
confusionMatrix(qda.holdout.preds, holdoutLabels, positive = 'tarp')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   other    tarp
##      other 1985998    4139
##      tarp     3699   10341
##                                          
##                Accuracy : 0.9961         
##                  95% CI : (0.996, 0.9962)
##     No Information Rate : 0.9928         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.7232         
##                                          
##  Mcnemar's Test P-Value : 7.099e-07      
##                                          
##             Sensitivity : 0.714157       
##             Specificity : 0.998141       
##          Pos Pred Value : 0.736538       
##          Neg Pred Value : 0.997920       
##              Prevalence : 0.007225       
##          Detection Rate : 0.005160       
##    Detection Prevalence : 0.007005       
##       Balanced Accuracy : 0.856149       
##                                          
##        'Positive' Class : tarp           
## 
# Predict Probabilities for ROC
qda.holdout.probs <- predict(qda.tuning, newdata = holdoutX, type = "prob")

# Column 2 is P(class = "tarp"), the positive class
qda.rates <- prediction(qda.holdout.probs[2], holdoutLabels)
qda.ROC <- performance(qda.rates, measure = "tpr", x.measure = "fpr")

qda.auc <- performance(qda.rates, measure = "auc")
qda.auc@y.values #auc = 0.9926837
## [[1]]
## [1] 0.9926837
# Assemble ROC coordinates into a data frame, colored by probability threshold
qda.roc.df <- data.frame(
  FPR = qda.ROC@x.values[[1]],
  TPR = qda.ROC@y.values[[1]],
  Threshold = qda.ROC@alpha.values[[1]]
)
qda.roc.df$Threshold[1] <- 1  # first cutoff is Inf; cap it for the color scale

qda.roc.df %>%
  ggplot(aes(FPR, TPR)) +
  geom_line(aes(color = Threshold), lwd = 2) +
  scale_colour_viridis(option = "plasma") +
  geom_abline(lwd = 1, color = "gray") +
  theme_bw() +
  theme(
    axis.line = element_line(color = "gray"),
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank()
  ) +
  labs(title = "ROC of QDA") +
  annotate("text", x = 0.75, y = 0.25, label = "AUC = 0.9927")

# predict over boundary.grid.
# Overwrite boundary.df with QDA's P(tarp) surface on the shared RGB grid
boundary.probs <- predict(qda.tuning, boundary.grid, type = "prob")
boundary.df <- cbind(boundary.grid, boundary.probs[2])

# hit/miss flags for the QDA holdout predictions
new.holdout <- holdout
new.holdout$hitmiss <- factor(ifelse(qda.holdout.preds==holdout$class,'hit', 'miss'))

# Boundary slice at green == 117 with holdout overlay
ggplot(data = NULL)+
    geom_contour_filled(data = slice(boundary.df,which(boundary.df$green==117)),
                        breaks = seq(0,1.0001,length.out=14),
                        aes(x=red, y=blue, z=tarp), bins = 13)+
  scale_fill_manual(values = colPal,guide = "none")+
    theme(axis.line = element_line(color='gray'),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank())+
    geom_jitter(data = new.holdout[which(new.holdout$green==117), ],
                size = 2, stroke = 1.3,
                aes(x=red, y=blue, color=class, shape = hitmiss))+
  scale_color_manual(values=c("#964B00", "#153250"))+
  scale_shape_manual(values=c(20,4))+
  ggtitle("QDA Decision Boundary [Blue against Red]")

# Same slice-and-overlay for QDA, fixing red == 140 (green vs blue view)
ggplot(data = NULL)+
    geom_contour_filled(data = slice(boundary.df,which(boundary.df$red==140)),
                        breaks = seq(0,1.0001,length.out=14),
                        aes(x=green, y=blue, z=tarp), bins = 13)+
  scale_fill_manual(values = colPal,guide = "none")+
    theme(axis.line = element_line(color='gray'),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank())+
    geom_jitter(data = new.holdout[which(new.holdout$red==140), ],
                size = 2, stroke = 1.3,
                aes(x=green, y=blue, color=class, shape = hitmiss))+
  scale_color_manual(values=c("#964B00", "#153250"))+
  scale_shape_manual(values=c(20,4))+
  ggtitle("QDA Decision Boundary [Blue against Green]")

Logistic Regression

Training

# Logistic regression via glm (family = "binomial"); no tuning parameters,
# so repeated CV only estimates out-of-sample accuracy.
logistic.ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 5, seeds = seeds)

logistic.tuning <- train(class~.,
                         data = tuning,
                         subset = train,
                         method = "glm",
                         family = "binomial",
                         metric = "Accuracy",
                         trControl = logistic.ctrl)
logistic.tuning
## Generalized Linear Model 
## 
## 63241 samples
##     3 predictor
##     2 classes: 'other', 'tarp' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 28458, 28459, 28459, 28459, 28459, 28459, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.9947314  0.9107111

Make prediction on the testing subset.

# Evaluate logistic regression on the held-back test half
logistic.preds <- predict(logistic.tuning, newdata = testX)
confusionMatrix(logistic.preds, testLabels, positive = 'tarp')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction other  tarp
##      other 30579    99
##      tarp     30   912
##                                           
##                Accuracy : 0.9959          
##                  95% CI : (0.9952, 0.9966)
##     No Information Rate : 0.968           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9318          
##                                           
##  Mcnemar's Test P-Value : 2.137e-09       
##                                           
##             Sensitivity : 0.90208         
##             Specificity : 0.99902         
##          Pos Pred Value : 0.96815         
##          Neg Pred Value : 0.99677         
##              Prevalence : 0.03197         
##          Detection Rate : 0.02884         
##    Detection Prevalence : 0.02979         
##       Balanced Accuracy : 0.95055         
##                                           
##        'Positive' Class : tarp            
## 

Holdout

# Score logistic regression on the holdout set and summarize via confusion matrix
logistic.holdout.preds <- predict(logistic.tuning, newdata = holdoutX)
confusionMatrix(logistic.holdout.preds, holdoutLabels, positive = 'tarp')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   other    tarp
##      other 1969423     159
##      tarp    20274   14321
##                                           
##                Accuracy : 0.9898          
##                  95% CI : (0.9897, 0.9899)
##     No Information Rate : 0.9928          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.5794          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.989019        
##             Specificity : 0.989811        
##          Pos Pred Value : 0.413962        
##          Neg Pred Value : 0.999919        
##              Prevalence : 0.007225        
##          Detection Rate : 0.007146        
##    Detection Prevalence : 0.017261        
##       Balanced Accuracy : 0.989415        
##                                           
##        'Positive' Class : tarp            
## 
# Holdout class probabilities, needed to build the ROC curve
logistic.holdout.probs <- predict(logistic.tuning, newdata = holdoutX, type = 'prob')

# ROCR objects constructed from the P(tarp) column
logistic.rates <- prediction(logistic.holdout.probs[, 2], holdoutLabels)
logistic.ROC <- performance(logistic.rates, "tpr", "fpr")

logistic.auc <- performance(logistic.rates, measure = "auc")
logistic.auc@y.values #auc = 0.9994541
## [[1]]
## [1] 0.9994541
# Assemble a data frame for a threshold-colored ROC plot
logistic.roc.df <- data.frame(FPR = unlist(logistic.ROC@x.values),
                              TPR = unlist(logistic.ROC@y.values),
                              Threshold = unlist(logistic.ROC@alpha.values))
logistic.roc.df$Threshold[1] <- 1 # first cutoff is Inf; clamp for the color scale

ggplot(logistic.roc.df, aes(FPR, TPR)) +
    geom_line(aes(color = Threshold), lwd = 2) +
    scale_colour_viridis(option = "plasma") +
    geom_abline(lwd = 1, color = "gray") +
    theme_bw() +
    theme(axis.line = element_line(color = 'gray'),
          plot.background = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank()) +
    ggtitle("ROC of Logistic Regression") +
    annotate("text", x = 0.75, y = 0.25, label = "AUC = 0.9995")

# Evaluate P(tarp) over the RGB grid to visualize the decision boundary.
# `boundary.grid` and `colPal` are defined earlier in the document.
boundary.probs <- predict(logistic.tuning, boundary.grid, type = "prob")
boundary.df <- cbind(boundary.grid, boundary.probs[2])

# Tag each holdout pixel as correctly (hit) or incorrectly (miss) classified.
new.holdout <- holdout
new.holdout$hitmiss <- factor(ifelse(logistic.holdout.preds==holdout$class,'hit', 'miss'))

# Slice the grid at green == 117 and draw filled P(tarp) contours in the
# red/blue plane, overlaying the holdout pixels with that green value.
ggplot(data = NULL)+
    geom_contour_filled(data = slice(boundary.df,which(boundary.df$green==117)),
                        breaks = seq(0,1.0001,length.out=14),
                        aes(x=red, y=blue, z=tarp), bins = 13)+
  scale_fill_manual(values = colPal,guide = "none")+
    theme(axis.line = element_line(color='gray'),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank())+
    geom_jitter(data = new.holdout[which(new.holdout$green==117), ],
                size = 2, stroke = 1.3,
                aes(x=red, y=blue, color=class, shape = hitmiss))+
  scale_color_manual(values=c("#964B00", "#153250"))+
  scale_shape_manual(values=c(20,4))+
  ggtitle("Logistic Regression Decision Boundary [Blue against Red]")

# Same visualization sliced at red == 140, in the green/blue plane.
ggplot(data = NULL)+
    geom_contour_filled(data = slice(boundary.df,which(boundary.df$red==140)),
                        breaks = seq(0,1.0001,length.out=14),
                        aes(x=green, y=blue, z=tarp), bins = 13)+
  scale_fill_manual(values = colPal,guide = "none")+
    theme(axis.line = element_line(color='gray'),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank())+
    geom_jitter(data = new.holdout[which(new.holdout$red==140), ],
                size = 2, stroke = 1.3,
                aes(x=green, y=blue, color=class, shape = hitmiss))+
  scale_color_manual(values=c("#964B00", "#153250"))+
  scale_shape_manual(values=c(20,4))+
  ggtitle("Logistic Regression Decision Boundary [Blue against Green]")

Random Forest

Training

# Repeated 10-fold CV (5 repeats). NOTE(review): `seeds` here is a seed
# list created earlier in the document (not shown in this chunk) --
# confirm it provides enough seeds for the 2-row mtry grid.
rf.ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 5, seeds = seeds)

# Tune a random forest over mtry (2 or 3 of the 3 RGB predictors),
# fitting only on the `train` rows of `tuning`.
rf.tuning <- train(class~.,
                   data = tuning,
                   subset = train,
                   method = 'rf',
                   metric = 'Accuracy',
                   trControl = rf.ctrl,
                   tuneGrid = expand.grid(mtry = c(2,3)))
rf.tuning
## Random Forest 
## 
## 63241 samples
##     3 predictor
##     2 classes: 'other', 'tarp' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 28458, 28459, 28459, 28459, 28459, 28459, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.9961292  0.9368615
##   3     0.9960216  0.9352254
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Test-split performance of the selected forest (mtry = 2).
rf.preds <- predict(rf.tuning, newdata = testX)
confusionMatrix(rf.preds, testLabels, positive = 'tarp')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction other  tarp
##      other 30553    41
##      tarp     56   970
##                                           
##                Accuracy : 0.9969          
##                  95% CI : (0.9963, 0.9975)
##     No Information Rate : 0.968           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9508          
##                                           
##  Mcnemar's Test P-Value : 0.1552          
##                                           
##             Sensitivity : 0.95945         
##             Specificity : 0.99817         
##          Pos Pred Value : 0.94542         
##          Neg Pred Value : 0.99866         
##              Prevalence : 0.03197         
##          Detection Rate : 0.03068         
##    Detection Prevalence : 0.03245         
##       Balanced Accuracy : 0.97881         
##                                           
##        'Positive' Class : tarp            
## 

Holdout

# Random forest on the holdout image. Sensitivity drops well below the
# test-split value, suggesting the forest generalizes less well here
# than logistic regression did.
rf.holdout.preds <- predict(rf.tuning, newdata = holdoutX)
confusionMatrix(rf.holdout.preds, holdoutLabels, positive = 'tarp')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   other    tarp
##      other 1974603    2336
##      tarp    15094   12144
##                                           
##                Accuracy : 0.9913          
##                  95% CI : (0.9912, 0.9914)
##     No Information Rate : 0.9928          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.5782          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.838674        
##             Specificity : 0.992414        
##          Pos Pred Value : 0.445848        
##          Neg Pred Value : 0.998818        
##              Prevalence : 0.007225        
##          Detection Rate : 0.006059        
##    Detection Prevalence : 0.013591        
##       Balanced Accuracy : 0.915544        
##                                           
##        'Positive' Class : tarp            
## 
# Holdout class probabilities, needed to build the ROC curve
rf.holdout.probs <- predict(rf.tuning, newdata = holdoutX, type = 'prob')

# ROCR objects constructed from the P(tarp) column
rf.rates <- prediction(rf.holdout.probs[, 2], holdoutLabels)
rf.ROC <- performance(rf.rates, "tpr", "fpr")

rf.auc <- performance(rf.rates, measure = "auc")
rf.auc@y.values #auc = 0.9798993
## [[1]]
## [1] 0.9798993
# Assemble a data frame for a threshold-colored ROC plot
rf.roc.df <- data.frame(FPR = unlist(rf.ROC@x.values),
                        TPR = unlist(rf.ROC@y.values),
                        Threshold = unlist(rf.ROC@alpha.values))
rf.roc.df$Threshold[1] <- 1 # first cutoff is Inf; clamp for the color scale

ggplot(rf.roc.df, aes(FPR, TPR)) +
    geom_line(aes(color = Threshold), lwd = 2) +
    scale_colour_viridis(option = "plasma") +
    geom_abline(lwd = 1, color = "gray") +
    theme_bw() +
    theme(axis.line = element_line(color = 'gray'),
          plot.background = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank()) +
    ggtitle("ROC of Random Forest (mtry = 2)") +
    annotate("text", x = 0.75, y = 0.25, label = "AUC = 0.9799")

# Evaluate P(tarp) over the RGB grid to visualize the forest's boundary.
boundary.probs <- predict(rf.tuning, boundary.grid, type = "prob")
boundary.df <- cbind(boundary.grid, boundary.probs[2])

# Tag each holdout pixel as correctly (hit) or incorrectly (miss) classified.
new.holdout <- holdout
new.holdout$hitmiss <- factor(ifelse(rf.holdout.preds==holdout$class,'hit', 'miss'))

# Slice the grid at green == 117: P(tarp) contours in the red/blue plane,
# with holdout pixels at that green value overlaid.
ggplot(data = NULL)+
    geom_contour_filled(data = slice(boundary.df,which(boundary.df$green==117)),
                        breaks = seq(0,1.0001,length.out=14),
                        aes(x=red, y=blue, z=tarp), bins = 13)+
  scale_fill_manual(values = colPal,guide = "none")+
  theme(axis.line = element_line(color='gray'),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank())+
  geom_jitter(data = new.holdout[which(new.holdout$green==117), ],
              size = 2, stroke = 1.3, 
              aes(x=red, y=blue, color=class, shape = hitmiss))+
  scale_color_manual(values=c("#964B00", "#153250"))+
  scale_shape_manual(values=c(20,4))+
  ggtitle("Random Forest Decision Boundary [Blue against Red]")

# Same visualization sliced at red == 140, in the green/blue plane.
ggplot(data = NULL)+
    geom_contour_filled(data = slice(boundary.df,which(boundary.df$red==140)),
                        breaks = seq(0,1.0001,length.out=14),
                        aes(x=green, y=blue, z=tarp), bins = 13)+
  scale_fill_manual(values = colPal,guide = "none")+
    theme(axis.line = element_line(color='gray'),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank())+
    geom_jitter(data = new.holdout[which(new.holdout$red==140), ],
                size = 2, stroke = 1.3,
                aes(x=green, y=blue, color=class, shape = hitmiss))+
  scale_color_manual(values=c("#964B00", "#153250"))+
  scale_shape_manual(values=c(20,4))+
  ggtitle("Random Forest Decision Boundary [Blue against Green]")

Support-Vector Machine [Polynomial]

Training

# Reproducible CV seeds for caret: one vector of 15 seeds per resample
# (10 folds x 5 repeats = 50), plus a single seed for the final fit.
set.seed(199)
svmP.seeds <- vector(mode = "list", length = 51)
for (i in seq_len(length(svmP.seeds) - 1)) {
  svmP.seeds[[i]] <- sample.int(n = 1000, 15)
}
svmP.seeds[[51]] <- sample.int(1000, 1)

# We create a custom SVM with Polynomial kernel so that we can tune over offset.
# The default svmPoly offered by train sets offset = 1. This should be tunable--
# or at the very least, we should be able to set offset = 0.

# Skeleton of a caret custom-model list; the remaining components
# (parameters, grid, fit, predict, prob, sort, levels) are attached below.
svmPolyCustom <- list(type = "Classification",
                      library = "kernlab",
                      loop = NULL)

# Tuning-parameter metadata caret requires: name, storage class, label.
prm <- data.frame(parameter = c("degree", "scale", "offset", "C"),
                  class = rep("numeric", 4),
                  label = c("Degree", "Scale", "Offset", "Cost"))

svmPolyCustom$parameters <- prm

# Default tuning-grid generator for the custom model; caret calls this
# with the training data and a target grid size `len` when the user does
# not supply tuneGrid.
svmGrid <- function(x, y, len = NULL, search = "grid") {
  if(search == "grid") {
    # Regular grid: degrees 1..min(len, 3); log-spaced scales; offsets
    # alternating in sign around 0; powers-of-two costs.
    out <- expand.grid(degree = seq(1, min(len, 3)),
                       scale = 10 ^((1:len) - 4),
                       offset = (-1)^(1:len)*ceiling((0:(len-1))/2),
                       C = 2 ^((1:len) - 3))
    } else {
      # Random search: sample each parameter from a wide range.
      out <- data.frame(degree = sample(1:3, size = len, replace = TRUE),
                        scale = 10^runif(len, min = -5, log10(2)),
                        offset = runif(len, min = -len, max = len),
                        C = 2^runif(len, min = -5, max = 10))
      }
  out
}

svmPolyCustom$grid <- svmGrid

# Fit function for the custom polynomial-kernel SVM. caret calls this
# once per tuning-parameter row; `param` carries degree/scale/offset/C.
svmFit <- function(x, y, wts, param, lev, last, classProbs, ...) {
  # `||` instead of `|`: both operands are scalars in an `if` condition.
  if(any(names(list(...)) == "prob.model") || is.numeric(y)) {
    out <- kernlab::ksvm(x = as.matrix(x), 
                         y = y,
                         kernel = kernlab::polydot(degree = param$degree,
                                                   scale = param$scale,
                                                   offset = param$offset),
                         C = param$C, ...)
    } else {
      # BUG FIX: this branch previously hard-coded offset = 0, silently
      # ignoring the tuned `offset` parameter -- the very reason this
      # custom model exists. Use param$offset in both branches. (The
      # grids used in this analysis set offset = 0, so fitted models
      # are unchanged here.)
      out <- kernlab::ksvm(x = as.matrix(x), 
                           y = y,
                           kernel = kernlab::polydot(degree = param$degree,
                                                     scale = param$scale,
                                                     offset = param$offset),
                           C = param$C,
                           prob.model = classProbs,
                           ...)
      }
  out
}

svmPolyCustom$fit <- svmFit

# Class-prediction function for the custom kernlab SVM model.
svmPred <- function(modelFit, newdata, submodels = NULL) {
  # Inner helper renamed (was also `svmPred`, shadowing the outer
  # function). If the ksvm object carries a probability model, predict
  # via probabilities and take the most likely class; otherwise use raw
  # class predictions.
  predictClass <- function(obj, x) {
    hasPM <- !is.null(unlist(obj@prob.model))
    if(hasPM) {
      pred <- kernlab::lev(obj)[apply(kernlab::predict(obj, x, type = "probabilities"),
                                      1, which.max)]
    } else pred <- kernlab::predict(obj, x)
    pred
  }
  out <- try(predictClass(modelFit, newdata), silent = TRUE)
  if(is.character(kernlab::lev(modelFit))) {
    # `inherits()` is the idiomatic class test (was class(out)[1] == ...).
    if(inherits(out, "try-error")) {
      warning("kernlab class prediction calculations failed; returning NAs")
      # Character NA vector directly (was rep("") then overwrite with NA).
      out <- rep(NA_character_, nrow(newdata))
    }
  } else {
    if(inherits(out, "try-error")) {
      warning("kernlab prediction calculations failed; returning NAs")
      out <- rep(NA, nrow(newdata))
    }
  }
  if(is.matrix(out)) out <- out[,1]
  out
}

svmPolyCustom$predict <- svmPred

# Class-probability function for the custom kernlab SVM model.
svmProb <- function(modelFit, newdata, submodels = NULL) {
  out <- try(kernlab::predict(modelFit, newdata, type="probabilities"),
             silent = TRUE)
  if(!inherits(out, "try-error")) {
    ## The SVM probability model can produce slightly negative class
    ## probabilities; clamp them at zero and renormalize each row so
    ## the values lie between 0 and 1 and sum to 1.
    if(any(out < 0)) {
      out[out < 0] <- 0
      out <- t(apply(out, 1, function(x) x/sum(x)))
    }
    out <- out[, kernlab::lev(modelFit), drop = FALSE]
  } else {
    warning("kernlab class probability calculations failed; returning NAs")
    # BUG FIX: the NA fallback previously allocated
    # nrow(newdata) * n_levels rows. A probability matrix must have one
    # row per observation and one column per class level.
    out <- matrix(NA, nrow = nrow(newdata), ncol = length(kernlab::lev(modelFit)))
    colnames(out) <- kernlab::lev(modelFit)
  }
  out
}

svmPolyCustom$prob <- svmProb

# Order tuning results by degree, cost, scale, then offset, so all four
# tunable parameters participate in the ordering (offset was omitted;
# with the constant-offset grids used here the order is unchanged).
svmSort <- function(x) x[order(x$degree, x$C, x$scale, x$offset),]

svmPolyCustom$sort <- svmSort

# Extract the outcome factor levels from a fitted ksvm object.
svmLevels <- function(x) kernlab::lev(x)

svmPolyCustom$levels <- svmLevels
# Tuning grid: scale fixed at 1 and offset at 0; tune degree and cost.
svmP.grid2 <- expand.grid(degree = c(2,3,4),
                         scale = 1,
                         offset = 0,
                         C = c(0.1,1,10,100,1000))

# Repeated 10-fold CV with fixed seeds; classProbs enables the class
# probabilities needed for ROC/AUC later. TRUE rather than T: T is an
# ordinary variable and can be reassigned.
svmP.control2 <- trainControl(method = "repeatedcv", 
                              number = 10, 
                              repeats = 5, 
                              seeds = svmP.seeds,
                              classProbs = TRUE)

# Tune the custom polynomial SVM over degree/cost, standardizing the
# RGB predictors first.
svmP.tuning2 <- train(class~.,
                      data = tuning,
                      subset = train,
                      method = svmPolyCustom,
                      metric = 'Accuracy',
                      trControl = svmP.control2,
                      tuneGrid = svmP.grid2,
                      preProcess = c("center","scale"))
## line search fails -2.049865 1.885835 1.061595e-05 -3.666528e-06 -4.598396e-08 -2.705361e-08 -3.889704e-13
# (The "line search fails" message is a kernlab optimizer warning for
# one parameter combination; results are still returned.)
svmP.tuning2
## 63241 samples
##     3 predictor
##     2 classes: 'other', 'tarp' 
## 
## Pre-processing: centered (3), scaled (3) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 28458, 28459, 28459, 28459, 28459, 28459, ... 
## Resampling results across tuning parameters:
## 
##   degree  C      Accuracy   Kappa    
##   2       1e-01  0.9934221  0.8838784
##   2       1e+00  0.9937700  0.8910470
##   2       1e+01  0.9936688  0.8889803
##   2       1e+02  0.9937257  0.8898182
##   2       1e+03  0.9920749  0.8339831
##   3       1e-01  0.9940040  0.8956550
##   3       1e+00  0.9947440  0.9116591
##   3       1e+01  0.9951488  0.9186073
##   3       1e+02  0.9950350  0.9162175
##   3       1e+03  0.9950539  0.9167053
##   4       1e-01  0.9921318  0.8567759
##   4       1e+00  0.9932450  0.8796333
##   4       1e+01  0.9933715  0.8819879
##   4       1e+02  0.9933336  0.8808683
##   4       1e+03  0.9824001  0.5572732
## 
## Tuning parameter 'scale' was held constant at a value of 1
## Tuning
##  parameter 'offset' was held constant at a value of 0
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were degree = 3, scale = 1, offset = 0
##  and C = 10.
# Final kernlab model refit on all training rows with the chosen parameters.
svmP.tuning2$finalModel
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 10 
## 
## Polynomial kernel function. 
##  Hyperparameters : degree =  3  scale =  1  offset =  0 
## 
## Number of Support Vectors : 405 
## 
## Objective Function Value : -3464.73 
## Training error : 0.004839 
## Probability model included.
# Test-split performance of the selected polynomial SVM (degree 3, C = 10).
svmP.tuning2.preds <- predict(svmP.tuning2, newdata = testX)
confusionMatrix(svmP.tuning2.preds, testLabels, positive = 'tarp')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction other  tarp
##      other 30565    83
##      tarp     44   928
##                                           
##                Accuracy : 0.996           
##                  95% CI : (0.9952, 0.9967)
##     No Information Rate : 0.968           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9339          
##                                           
##  Mcnemar's Test P-Value : 0.0007464       
##                                           
##             Sensitivity : 0.91790         
##             Specificity : 0.99856         
##          Pos Pred Value : 0.95473         
##          Neg Pred Value : 0.99729         
##              Prevalence : 0.03197         
##          Detection Rate : 0.02935         
##    Detection Prevalence : 0.03074         
##       Balanced Accuracy : 0.95823         
##                                           
##        'Positive' Class : tarp            
## 

Holdout

# Polynomial SVM on the holdout image: highest holdout sensitivity and
# PPV among the models so far.
svmP.holdout.preds2 <- predict(svmP.tuning2, newdata = holdoutX)
confusionMatrix(svmP.holdout.preds2, holdoutLabels, positive = 'tarp')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   other    tarp
##      other 1978270     133
##      tarp    11427   14347
##                                           
##                Accuracy : 0.9942          
##                  95% CI : (0.9941, 0.9943)
##     No Information Rate : 0.9928          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7101          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.990815        
##             Specificity : 0.994257        
##          Pos Pred Value : 0.556646        
##          Neg Pred Value : 0.999933        
##              Prevalence : 0.007225        
##          Detection Rate : 0.007159        
##    Detection Prevalence : 0.012860        
##       Balanced Accuracy : 0.992536        
##                                           
##        'Positive' Class : tarp            
## 
# Holdout class probabilities, needed to build the ROC curve
svmP.holdout.probs2 <- predict(svmP.tuning2, newdata = holdoutX, type = "prob")

# ROCR objects constructed from the P(tarp) column
svmP.rates2 <- prediction(svmP.holdout.probs2[, 2], holdoutLabels)
svmP.ROC2 <- performance(svmP.rates2, "tpr", "fpr")

svmP.auc2 <- performance(svmP.rates2, measure = "auc")
svmP.auc2@y.values #auc = 0.9991007
## [[1]]
## [1] 0.9991007
# Assemble a data frame for a threshold-colored ROC plot
svmP.roc.df2 <- data.frame(FPR = unlist(svmP.ROC2@x.values),
                           TPR = unlist(svmP.ROC2@y.values),
                           Threshold = unlist(svmP.ROC2@alpha.values))
svmP.roc.df2$Threshold[1] <- 1 # first cutoff is Inf; clamp for the color scale

ggplot(svmP.roc.df2, aes(FPR, TPR)) +
    geom_line(aes(color = Threshold), lwd = 2) +
    scale_colour_viridis(option = "plasma") +
    geom_abline(lwd = 1, color = "gray") +
    theme_bw() +
    theme(axis.line = element_line(color = 'gray'),
          plot.background = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank()) +
    ggtitle("ROC of SVM [Polynomial: degree = 3, cost = 10]") +
    annotate("text", x = 0.75, y = 0.25, label = "AUC = 0.9991")

# Evaluate P(tarp) over the RGB grid to visualize the SVM's boundary.
boundary.probs <- predict(svmP.tuning2, boundary.grid, type = "prob")
boundary.df <- cbind(boundary.grid, boundary.probs[2])

# Tag each holdout pixel as correctly (hit) or incorrectly (miss) classified.
new.holdout <- holdout
new.holdout$hitmiss <- factor(ifelse(svmP.holdout.preds2==holdout$class,'hit', 'miss'))

# Slice the grid at green == 117: P(tarp) contours in the red/blue plane,
# with holdout pixels at that green value overlaid.
ggplot(data = NULL)+
    geom_contour_filled(data = slice(boundary.df,which(boundary.df$green==117)),
                        breaks = seq(0,1.0001,length.out=14),
                        aes(x=red, y=blue, z=tarp), bins = 13)+
  scale_fill_manual(values = colPal,guide = "none")+
    theme(axis.line = element_line(color='gray'),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank())+
  geom_jitter(data = new.holdout[which(new.holdout$green==117), ],
              size = 2, stroke = 1.3, 
              aes(x=red, y=blue, color=class, shape = hitmiss))+
  scale_color_manual(values=c("#964B00", "#153250"))+
  scale_shape_manual(values=c(20,4))+
  ggtitle("SVM [Polynomial d = 3] Decision Boundary [Blue against Red]")

# Same visualization sliced at red == 140, in the green/blue plane.
ggplot(data = NULL)+
    geom_contour_filled(data = slice(boundary.df,which(boundary.df$red==140)),
                        breaks = seq(0,1.0001,length.out=14),
                        aes(x=green, y=blue, z=tarp), bins = 13)+
  scale_fill_manual(values = colPal,guide = "none")+
    theme(axis.line = element_line(color='gray'),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank())+
    geom_jitter(data = new.holdout[which(new.holdout$red==140), ],
                size = 2, stroke = 1.3,
                aes(x=green, y=blue, color=class, shape = hitmiss))+
  scale_color_manual(values=c("#964B00", "#153250"))+
  scale_shape_manual(values=c(20,4))+
  ggtitle("SVM [Polynomial d = 3] Decision Boundary [Blue against Green]")

Support-Vector Machine [RBF]

Training

# Reproducible CV seeds: one vector of 25 seeds per resample (one per
# sigma/C grid row; 10 folds x 5 repeats = 50), plus one final-fit seed.
set.seed(199)
svmR.seeds <- vector(mode = "list", length = 51)
for (i in seq_len(length(svmR.seeds) - 1)) {
  svmR.seeds[[i]] <- sample.int(n = 1000, 25)
}
svmR.seeds[[51]] <- sample.int(1000, 1)

# Radial-kernel grid: tune bandwidth (sigma) and cost.
svmR.grid <- expand.grid(sigma = c(0.5,1,2,5,10),
                         C = c(0.1,1,10,100,1000))

# Repeated 10-fold CV with fixed seeds; classProbs enables the class
# probabilities needed for ROC/AUC later. TRUE rather than T: T is an
# ordinary variable and can be reassigned.
svmR.control1 <- trainControl(method = "repeatedcv", 
                              number = 10, 
                              repeats = 5, 
                              seeds = svmR.seeds,
                              classProbs = TRUE)

# Tune the built-in RBF-kernel SVM over the sigma/C grid, standardizing
# the RGB predictors first.
svmR.tuning1 <- train(class~.,
                      data = tuning,
                      subset = train,
                      method = 'svmRadial',
                      metric = 'Accuracy',
                      trControl = svmR.control1,
                      tuneGrid = svmR.grid,
                      preProcess = c("center","scale"))
# CV results across the sigma/C grid; accuracy peaks at sigma = 5, C = 1000.
svmR.tuning1
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 63241 samples
##     3 predictor
##     2 classes: 'other', 'tarp' 
## 
## Pre-processing: centered (3), scaled (3) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 28459, 28459, 28459, 28459, 28459, 28459, ... 
## Resampling results across tuning parameters:
## 
##   sigma  C      Accuracy   Kappa    
##    0.5   1e-01  0.9953132  0.9241391
##    0.5   1e+00  0.9957560  0.9311194
##    0.5   1e+01  0.9960533  0.9356457
##    0.5   1e+02  0.9965972  0.9444995
##    0.5   1e+03  0.9967553  0.9470093
##    1.0   1e-01  0.9953702  0.9249098
##    1.0   1e+00  0.9959837  0.9346783
##    1.0   1e+01  0.9963505  0.9405322
##    1.0   1e+02  0.9967364  0.9467084
##    1.0   1e+03  0.9966731  0.9457134
##    2.0   1e-01  0.9955156  0.9269026
##    2.0   1e+00  0.9959457  0.9336071
##    2.0   1e+01  0.9965972  0.9445584
##    2.0   1e+02  0.9967111  0.9464128
##    2.0   1e+03  0.9967933  0.9476478
##    5.0   1e-01  0.9959900  0.9344407
##    5.0   1e+00  0.9962683  0.9385540
##    5.0   1e+01  0.9967237  0.9462793
##    5.0   1e+02  0.9968439  0.9486944
##    5.0   1e+03  0.9969388  0.9504169
##   10.0   1e-01  0.9961734  0.9372800
##   10.0   1e+00  0.9965846  0.9439289
##   10.0   1e+01  0.9967617  0.9467812
##   10.0   1e+02  0.9968502  0.9485227
##   10.0   1e+03  0.9965023  0.9429719
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 5 and C = 1000.
# Final kernlab model refit on all training rows with the chosen parameters.
svmR.tuning1$finalModel
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 1000 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  5 
## 
## Number of Support Vectors : 239 
## 
## Objective Function Value : -168491.2 
## Training error : 0.002245
# Test-split performance of the selected RBF SVM (sigma = 5, C = 1000).
svmR.tuning1.preds <- predict(svmR.tuning1, newdata = testX)
confusionMatrix(svmR.tuning1.preds, testLabels, positive = 'tarp')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction other  tarp
##      other 30558    28
##      tarp     51   983
##                                          
##                Accuracy : 0.9975         
##                  95% CI : (0.9969, 0.998)
##     No Information Rate : 0.968          
##     P-Value [Acc > NIR] : < 2e-16        
##                                          
##                   Kappa : 0.9601         
##                                          
##  Mcnemar's Test P-Value : 0.01332        
##                                          
##             Sensitivity : 0.97230        
##             Specificity : 0.99833        
##          Pos Pred Value : 0.95068        
##          Neg Pred Value : 0.99908        
##              Prevalence : 0.03197        
##          Detection Rate : 0.03109        
##    Detection Prevalence : 0.03270        
##       Balanced Accuracy : 0.98532        
##                                          
##        'Positive' Class : tarp           
## 

Holdout

# RBF SVM on the holdout image. Despite the best test-split accuracy,
# holdout sensitivity falls to ~0.75 -- a sign the high-cost, high-sigma
# fit overfits the training distribution.
svmR.holdout.preds <- predict(svmR.tuning1, newdata = holdoutX)
confusionMatrix(svmR.holdout.preds, holdoutLabels, positive = 'tarp')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   other    tarp
##      other 1974633    3595
##      tarp    15064   10885
##                                           
##                Accuracy : 0.9907          
##                  95% CI : (0.9906, 0.9908)
##     No Information Rate : 0.9928          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.5342          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.751727        
##             Specificity : 0.992429        
##          Pos Pred Value : 0.419477        
##          Neg Pred Value : 0.998183        
##              Prevalence : 0.007225        
##          Detection Rate : 0.005431        
##    Detection Prevalence : 0.012947        
##       Balanced Accuracy : 0.872078        
##                                           
##        'Positive' Class : tarp            
## 
# Holdout class probabilities, needed to build the ROC curve
svmR.holdout.probs <- predict(svmR.tuning1, newdata = holdoutX, type = 'prob')

# ROCR objects constructed from the P(tarp) column
svmR.rates <- prediction(svmR.holdout.probs[, 2], holdoutLabels)
svmR.ROC <- performance(svmR.rates, "tpr", "fpr")

svmR.auc <- performance(svmR.rates, measure = "auc")
svmR.auc@y.values #auc = 0.986921
## [[1]]
## [1] 0.986921
# Assemble a data frame for a threshold-colored ROC plot
svmR.roc.df <- data.frame(FPR = unlist(svmR.ROC@x.values),
                          TPR = unlist(svmR.ROC@y.values),
                          Threshold = unlist(svmR.ROC@alpha.values))
svmR.roc.df$Threshold[1] <- 1 # first cutoff is Inf; clamp for the color scale

ggplot(svmR.roc.df, aes(FPR, TPR)) +
    geom_line(aes(color = Threshold), lwd = 2) +
    scale_colour_viridis(option = "plasma") +
    geom_abline(lwd = 1, color = "gray") +
    theme_bw() +
    theme(axis.line = element_line(color = 'gray'),
          plot.background = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank()) +
    ggtitle("ROC of SVM [RBF: gamma = 5, cost = 1000]") +
    annotate("text", x = 0.75, y = 0.25, label = "AUC = 0.9869")

# Evaluate P(tarp) over the RGB grid to visualize the RBF SVM's boundary.
boundary.probs <- predict(svmR.tuning1, boundary.grid, type = "prob")
boundary.df <- cbind(boundary.grid, boundary.probs[2])

# Tag each holdout pixel as correctly (hit) or incorrectly (miss) classified.
new.holdout <- holdout
new.holdout$hitmiss <- factor(ifelse(svmR.holdout.preds==holdout$class,'hit', 'miss'))

# Slice the grid at green == 117: P(tarp) contours in the red/blue plane,
# with holdout pixels at that green value overlaid.
ggplot(data = NULL)+
    geom_contour_filled(data = slice(boundary.df,which(boundary.df$green==117)),
                        breaks = seq(0,1.0001,length.out=14),
                        aes(x=red, y=blue, z=tarp), bins = 13)+
  scale_fill_manual(values = colPal,guide = "none")+
    theme(axis.line = element_line(color='gray'),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank())+
  geom_jitter(data = new.holdout[which(new.holdout$green==117), ],
              size = 2, stroke = 1.3, 
              aes(x=red, y=blue, color=class, shape = hitmiss))+
  scale_color_manual(values=c("#964B00", "#153250"))+
  scale_shape_manual(values=c(20,4))+
  ggtitle("SVM RBF
          Decision Boundary [Blue against Red]")

# Same visualization sliced at red == 140, in the green/blue plane.
ggplot(data = NULL)+
    geom_contour_filled(data = slice(boundary.df,which(boundary.df$red==140)),
                        breaks = seq(0,1.0001,length.out=14),
                        aes(x=green, y=blue, z=tarp), bins = 13)+
  scale_fill_manual(values = colPal,guide = "none")+
    theme(axis.line = element_line(color='gray'),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank())+
    geom_jitter(data = new.holdout[which(new.holdout$red==140), ],
                size = 2, stroke = 1.3,
                aes(x=green, y=blue, color=class, shape = hitmiss))+
  scale_color_manual(values=c("#964B00", "#153250"))+
  scale_shape_manual(values=c(20,4))+
  ggtitle("SVM RBF Decision Boundary [Blue against Green]")

Mahalanobis Classifier

Professor’s Method

Training

We evaluate the accuracy of Professor Basener’s Mahalanobis Distance Classifier. The only parameter for this method is the p-value threshold used to label the testing points. Ideally, we would tune this with cross-validation. In the interest of simplicity, we just use a test/train split.

library(MASS)

# Work on copies so the tuning data frame itself is never modified.
training.copy <- tuning[train, ]
testing.copy <- tuning[-train, ]

# Reference distribution: RGB values of training-set tarp pixels only.
tarpsOnly <- training.copy[which(training.copy$class == "tarp"), ]

# Keep only the three color channels (drop the class column, by name
# rather than by position).
tarpsOnly_rgb <- tarpsOnly[, c("red", "green", "blue")]
testing.copy_rgb <- testing.copy[, c("red", "green", "blue")]

# Squared Mahalanobis distance of each test pixel from the tarp mean and
# covariance; under multivariate normality these are chi-squared with
# df = 3, so convert to upper-tail p-values.
testing.copy$mahalanobis <- mahalanobis(testing.copy_rgb,
                                        colMeans(tarpsOnly_rgb),
                                        cov(tarpsOnly_rgb))
testing.copy$pvalue <- pchisq(testing.copy$mahalanobis, df = 3, lower.tail = FALSE)

# idea: create ROC-like plot...
# idea: line plot accuracy (and TPR and FPR) against Mahalanobis Dist to select cut off
# Sweep candidate p-value thresholds and record accuracy, TPR, FPR, and
# PPV at each one.
threshold.mesh <- seq(0.01,0.99,length.out=99)

# Preallocate result vectors (the originals grew inside the loop).
acc <- numeric(length(threshold.mesh))
tpr <- numeric(length(threshold.mesh))
fpr <- numeric(length(threshold.mesh))
ppv <- numeric(length(threshold.mesh))

for (tr in seq_along(threshold.mesh)){
  predicted.labels <- ifelse(testing.copy$pvalue>=threshold.mesh[tr], "tarp", "other")
  # Fix both prediction levels so the table is always 2x2; without this,
  # a threshold producing only one predicted class would make the
  # [2,2]-style indexing below error out.
  cf.matrix <- table(factor(predicted.labels, levels = c("other", "tarp")),
                     testing.copy$class)

  acc[tr] <- (cf.matrix[1,1]+cf.matrix[2,2])/sum(cf.matrix)
  tpr[tr] <- cf.matrix[2,2]/(cf.matrix[2,2]+cf.matrix[1,2])  # sensitivity
  fpr[tr] <- cf.matrix[2,1]/(cf.matrix[2,1]+cf.matrix[1,1])  # 1 - specificity
  ppv[tr] <- cf.matrix[2,2]/(cf.matrix[2,2]+cf.matrix[2,1])  # precision
}

# Build the summary directly as a data frame (replaces the previous
# cbind -> data.frame -> rename -> as.numeric sequence; all columns are
# already numeric).
threshold.df <- data.frame(threshold = threshold.mesh,
                           accuracy = acc,
                           TPR = tpr,
                           FPR = fpr,
                           PPV = ppv)

#ideal threshold based on overall accuracy
(best.thresh <- threshold.df$threshold[which.max(threshold.df$accuracy)])
## [1] 0.27
# Accuracy as a function of the candidate p-value threshold.
with(threshold.df,
     plot(threshold, accuracy,
          type = "l",
          xlab = "p-value Threshold",
          ylab = " Accuracy",
          main = "Prediction Accuracy against p-value Threshold"))

The test/train split chooses a p-value threshold of 0.27. Observations with p-values of at least 0.27 are classified as tarp, and observations with p-values below 0.27 are classified as other. We now view the confusion matrix for the testing data at this threshold.

# Final test-split labels at the chosen threshold, then the confusion
# matrix with "tarp" as the positive class.
Mah.labels <- factor(ifelse(testing.copy$pvalue >= best.thresh, "tarp", "other"))
confusionMatrix(Mah.labels, testing.copy$class, positive = "tarp")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction other  tarp
##      other 30506   266
##      tarp    103   745
##                                           
##                Accuracy : 0.9883          
##                  95% CI : (0.9871, 0.9895)
##     No Information Rate : 0.968           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7955          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.73689         
##             Specificity : 0.99663         
##          Pos Pred Value : 0.87854         
##          Neg Pred Value : 0.99136         
##              Prevalence : 0.03197         
##          Detection Rate : 0.02356         
##    Detection Prevalence : 0.02682         
##       Balanced Accuracy : 0.86676         
##                                           
##        'Positive' Class : tarp            
## 

Holdout

We now apply the classifier to the holdout data.

# Work on a copy of the holdout set; column 4 holds the class label,
# so dropping it leaves the RGB predictors.
holdout.copy <- holdout
holdout.copy_rgb <- holdout.copy[, -4]

# Mahalanobis distance of every holdout pixel from the tarp centroid
# (fit on the training tarps) and the upper-tail chi-squared p-value.
holdout.copy$mahalanobis <- mahalanobis(
  holdout.copy_rgb,
  colMeans(tarpsOnly_rgb),
  cov(tarpsOnly_rgb)
)
holdout.copy$pvalue <- pchisq(holdout.copy$mahalanobis, df = 3,
                              lower.tail = FALSE)

# Repeat the threshold sweep on the holdout data.
# NOTE(review): re-tuning the threshold on the holdout set leaks
# holdout information into model selection; arguably the training-split
# threshold (best.thresh) should be applied unchanged here. Kept as-is
# to preserve the reported results.
n.thresh <- length(threshold.mesh)
acc.h <- numeric(n.thresh)  # preallocated, not grown with c()
tpr.h <- numeric(n.thresh)
fpr.h <- numeric(n.thresh)
ppv.h <- numeric(n.thresh)

for (tr in seq_len(n.thresh)) {
  # Fixed factor levels keep the confusion matrix 2x2 even when one
  # predicted label never occurs at this threshold.
  predicted.labels <- factor(
    ifelse(holdout.copy$pvalue >= threshold.mesh[tr], "tarp", "other"),
    levels = c("other", "tarp")
  )
  cf.matrix <- table(predicted.labels, holdout.copy$class)

  # Row/column 1 = "other", row/column 2 = "tarp".
  acc.h[tr] <- (cf.matrix[1, 1] + cf.matrix[2, 2]) / sum(cf.matrix)
  tpr.h[tr] <- cf.matrix[2, 2] / (cf.matrix[2, 2] + cf.matrix[1, 2])
  fpr.h[tr] <- cf.matrix[2, 1] / (cf.matrix[2, 1] + cf.matrix[1, 1])
  ppv.h[tr] <- cf.matrix[2, 2] / (cf.matrix[2, 2] + cf.matrix[2, 1])
}

# Columns are already numeric; build the data frame directly.
threshold.df.h <- data.frame(
  threshold = threshold.mesh,
  accuracy = acc.h, TPR = tpr.h, FPR = fpr.h, PPV = ppv.h
)

# Holdout threshold that maximizes overall accuracy.
(best.thresh.h <- threshold.df.h$threshold[which.max(threshold.df.h$accuracy)])
## [1] 0.53
# Accuracy versus candidate p-value threshold on the holdout data.
with(threshold.df.h,
     plot(threshold, accuracy,
          type = "l",
          xlab = "p-value Threshold",
          ylab = " Accuracy",
          main = "Prediction Accuracy against p-value Threshold"))

# Apply labels using the holdout-selected p-value threshold
# (best.thresh.h = 0.53 — NOT the 0.27 chosen on the training split,
# contrary to the original comment).
predicted.labels.holdout <- ifelse(holdout.copy$pvalue >= best.thresh.h, "tarp", "other") %>% factor()
# Confusion matrix on the holdout set; "tarp" is the positive class.
confusionMatrix(predicted.labels.holdout, holdout.copy$class, positive = "tarp")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   other    tarp
##      other 1989116   13082
##      tarp      581    1398
##                                           
##                Accuracy : 0.9932          
##                  95% CI : (0.9931, 0.9933)
##     No Information Rate : 0.9928          
##     P-Value [Acc > NIR] : 3.128e-12       
##                                           
##                   Kappa : 0.1684          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.0965470       
##             Specificity : 0.9997080       
##          Pos Pred Value : 0.7064174       
##          Neg Pred Value : 0.9934662       
##              Prevalence : 0.0072249       
##          Detection Rate : 0.0006975       
##    Detection Prevalence : 0.0009874       
##       Balanced Accuracy : 0.5481275       
##                                           
##        'Positive' Class : tarp            
## 
# Use the holdout p-values as class scores for ROC analysis
# (larger p-value = more tarp-like).
mahalanobis.holdout.probs <- holdout.copy$pvalue

mahalanobis.rates <- prediction(mahalanobis.holdout.probs, holdout.copy$class)
mahalanobis.ROC <- performance(mahalanobis.rates, measure = 'tpr', x.measure = 'fpr')

mahalanobis.auc <- performance(mahalanobis.rates, measure = "auc")
mahalanobis.auc@y.values # auc = 0.762413 (the previous note of 0.7543164 was stale)
## [[1]]
## [1] 0.762413
# Assemble the ROC coordinates into a data frame so ggplot can color
# the curve by classification threshold. ROCR stores each measure as a
# one-element list, hence the [[1]] extraction.
mahalanobis.roc.df <- data.frame(
  FPR = mahalanobis.ROC@x.values[[1]],
  TPR = mahalanobis.ROC@y.values[[1]],
  Threshold = mahalanobis.ROC@alpha.values[[1]]
)
# ROCR reports the first cut-off as Inf; cap it at 1 for plotting.
mahalanobis.roc.df$Threshold[1] <- 1

mahalanobis.roc.df %>%
  ggplot(aes(FPR, TPR)) +
  geom_line(aes(color = Threshold), lwd = 2) +
  scale_colour_viridis(option = "plasma") +
  geom_abline(lwd = 1, color = "gray") +       # chance diagonal
  theme_bw() +
  theme(axis.line = element_line(color = 'gray'),
        plot.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank()) +
  ggtitle("ROC of Mahalanobis Distance Classifier") +
  annotate("text", x = 0.75, y = 0.25, label = "AUC = 0.7624")

# p-value surface over the RGB grid, used below to draw the decision
# boundary as filled contours.
boundary.dist <- mahalanobis(boundary.grid,
                             colMeans(tarpsOnly_rgb),
                             cov(tarpsOnly_rgb))
boundary.probs <- pchisq(boundary.dist, df = 3, lower.tail = FALSE)
boundary.df <- cbind(boundary.grid, boundary.probs, boundary.dist)

# Holdout labels at the chosen threshold, plus hit/miss markers for the
# boundary plots.
holdout.copy$labels <- ifelse(holdout.copy$pvalue >= best.thresh.h,
                              "tarp", "other")
holdout.copy$hitmiss <- factor(
  ifelse(holdout.copy$labels == holdout.copy$class, 'hit', 'miss')
)

# Decision-boundary slice at green == 186: filled contours of the tarp
# p-value over (red, blue), with the holdout pixels in that slice
# jittered on top, colored by true class and shaped by hit/miss.
# `colPal` is a color palette defined earlier in the file.
ggplot(data = NULL)+
    geom_contour_filled(data = slice(boundary.df,which(boundary.df$green==186)),
                        breaks =seq(0,1,length.out = 14),
                        aes(x=red, y=blue, z=boundary.probs), bins = 13)+
  scale_fill_manual(values = colPal, guide = "none")+
    theme(axis.line = element_line(color='gray'),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank())+        
    geom_jitter(data = holdout.copy[which(holdout.copy$green==186), ],
                size = 2, stroke = 1.3,
                aes(x=red, y=blue, color=class, shape = hitmiss))+
  scale_color_manual(values=c("#964B00", "#153250"))+
  scale_shape_manual(values=c(20,4))+
  ggtitle("Mahalanobis Classifier Decision Boundary [Blue against Red]")

# Companion slice at red == 140: filled p-value contours over
# (green, blue), again overlaying the matching holdout pixels with
# hit/miss markers.
ggplot(data = NULL)+
    geom_contour_filled(data = slice(boundary.df,which(boundary.df$red==140)),
                        breaks = seq(0,1,length.out = 14),
                        aes(x=green, y=blue, z=boundary.probs), bins = 13)+
  scale_fill_manual(values = colPal, guide = "none")+
    theme(axis.line = element_line(color='gray'),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank())+
    geom_jitter(data = holdout.copy[which(holdout.copy$red==140), ],
                size = 2, stroke = 1.3,
                aes(x=green, y=blue, color = class, shape = hitmiss))+
  scale_color_manual(values=c("#964B00", "#153250"))+
  scale_shape_manual(values=c(20,4))+
  ggtitle("Mahalanobis Classifier Decision Boundary [Blue against Green]")

Mahalanobis with PCA Projection?

The overall accuracy of the Mahalanobis classifier is quite good and comparable to the other methods tested. However, its sensitivity (TPR) is exceptionally poor. We ask: will projecting the RGB values onto their principal components lead to a significant increase in sensitivity?

# Fit principal components on the standardized training RGB values and
# keep the loadings, so the holdout data can be projected with the SAME
# centering/scaling/rotation learned on the training split.
# (prcomp's formal argument is `scale.`; spell it out rather than rely
# on partial matching, and use TRUE/FALSE rather than T/F.)
train_pc <- prcomp(training.copy[, c("red", "green", "blue")],
                   center = TRUE, scale. = TRUE)
loadings <- train_pc$rotation

# Reattach the class labels to the training-set PC scores.
train_pc_copy <- train_pc$x %>% data.frame()
train_pc_copy$class <- tuning[train, "class"]

# Tarp-only scores; the Mahalanobis model is fit on these.
tarpsOnly_pc <- train_pc_copy[which(train_pc_copy$class == "tarp"), ]
tarpsOnly_pc_rgb <- tarpsOnly_pc[, 1:3]

# Project the holdout RGB values onto the training components. Select
# the RGB columns by name: holdout.copy has had mahalanobis/pvalue/
# labels/hitmiss columns appended by this point, so positional
# indexing like [1:3] is fragile.
holdout_pc_rgb <- scale(holdout.copy[, c("red", "green", "blue")],
                        train_pc$center, train_pc$scale) %*% loadings

holdout_pc_copy <- holdout_pc_rgb %>% data.frame()
holdout_pc_copy$class <- holdout$class

# Mahalanobis distance in PC space and its chi-squared p-value (df = 3).
holdout_pc_copy$mahalanobis <- mahalanobis(holdout_pc_rgb,
                                           colMeans(tarpsOnly_pc_rgb),
                                           cov(tarpsOnly_pc_rgb))
holdout_pc_copy$pvalue <- pchisq(holdout_pc_copy$mahalanobis, df = 3,
                                 lower.tail = FALSE)

# Threshold sweep for the PCA-projected classifier (same procedure as
# the untransformed holdout sweep; reuses and overwrites acc.h et al.).
n.thresh <- length(threshold.mesh)
acc.h <- numeric(n.thresh)  # preallocated, not grown with c()
tpr.h <- numeric(n.thresh)
fpr.h <- numeric(n.thresh)
ppv.h <- numeric(n.thresh)

for (tr in seq_len(n.thresh)) {
  # Fixed factor levels keep the confusion matrix 2x2 even when one
  # predicted label never occurs at this threshold.
  predicted.labels <- factor(
    ifelse(holdout_pc_copy$pvalue >= threshold.mesh[tr], "tarp", "other"),
    levels = c("other", "tarp")
  )
  cf.matrix <- table(predicted.labels, holdout_pc_copy$class)

  # Row/column 1 = "other", row/column 2 = "tarp".
  acc.h[tr] <- (cf.matrix[1, 1] + cf.matrix[2, 2]) / sum(cf.matrix)
  tpr.h[tr] <- cf.matrix[2, 2] / (cf.matrix[2, 2] + cf.matrix[1, 2])
  fpr.h[tr] <- cf.matrix[2, 1] / (cf.matrix[2, 1] + cf.matrix[1, 1])
  ppv.h[tr] <- cf.matrix[2, 2] / (cf.matrix[2, 2] + cf.matrix[2, 1])
}

# Columns are already numeric; build the data frame directly.
threshold.df.h <- data.frame(
  threshold = threshold.mesh,
  accuracy = acc.h, TPR = tpr.h, FPR = fpr.h, PPV = ppv.h
)

# Threshold that maximizes overall accuracy for the PCA run.
(best.thresh.h <- threshold.df.h$threshold[which.max(threshold.df.h$accuracy)])
## [1] 0.53
# Accuracy versus candidate threshold for the PCA-projected classifier.
with(threshold.df.h,
     plot(threshold, accuracy,
          type = "l",
          xlab = "p-value Threshold",
          ylab = " Accuracy",
          main = "Prediction Accuracy against p-value Threshold"))

# Apply labels using the holdout-selected p-value threshold
# (best.thresh.h = 0.53 — NOT the 0.27 named in the original comment).
predicted.labels.holdout <- ifelse(holdout_pc_copy$pvalue >= best.thresh.h, "tarp", "other") %>% factor()
# Confusion matrix for the PCA run; "tarp" is the positive class.
confusionMatrix(predicted.labels.holdout, holdout_pc_copy$class, positive = "tarp")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   other    tarp
##      other 1989116   13082
##      tarp      581    1398
##                                           
##                Accuracy : 0.9932          
##                  95% CI : (0.9931, 0.9933)
##     No Information Rate : 0.9928          
##     P-Value [Acc > NIR] : 3.128e-12       
##                                           
##                   Kappa : 0.1684          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.0965470       
##             Specificity : 0.9997080       
##          Pos Pred Value : 0.7064174       
##          Neg Pred Value : 0.9934662       
##              Prevalence : 0.0072249       
##          Detection Rate : 0.0006975       
##    Detection Prevalence : 0.0009874       
##       Balanced Accuracy : 0.5481275       
##                                           
##        'Positive' Class : tarp            
## 
# ROC analysis for the PCA-projected classifier (reuses and overwrites
# the mahalanobis.* objects from the non-PCA run above).
mahalanobis.holdout.probs <- holdout_pc_copy$pvalue

mahalanobis.rates <- prediction(mahalanobis.holdout.probs, holdout_pc_copy$class)
mahalanobis.ROC <- performance(mahalanobis.rates, measure = 'tpr', x.measure = 'fpr')

mahalanobis.auc <- performance(mahalanobis.rates, measure = "auc")
# NOTE(review): Mahalanobis distance is invariant under invertible
# affine transformations, so a full-rank PCA projection yields the
# same distances — and hence the same AUC (0.762413) and confusion
# matrix — as the untransformed classifier, as the printed results show.
mahalanobis.auc@y.values # auc = 0.762413 (previous note of 0.7543164 was stale)
## [[1]]
## [1] 0.762413
# Build the ROC coordinates for the PCA run so ggplot can color the
# curve by threshold. ROCR stores each measure as a one-element list;
# data.frame() over the concatenated, renamed list yields columns
# FPR, TPR, Threshold.
mahalanobis.roc.df <- data.frame(c("FPR" = mahalanobis.ROC@x.values, 
                                "TPR" = mahalanobis.ROC@y.values, 
                                "Threshold" = mahalanobis.ROC@alpha.values))
mahalanobis.roc.df$Threshold[1] <- 1 # ROCR's first cut-off is Inf; cap at 1 for plotting

# NOTE(review): the AUC annotation below is hard-coded; confirm it
# matches the AUC printed for this (PCA) run.
mahalanobis.roc.df %>% ggplot(aes(FPR, TPR))+
    geom_line(aes(color=Threshold), lwd = 2)+
    scale_colour_viridis(option = "plasma")+
    geom_abline(lwd = 1, color = "gray")+
    theme_bw()+
    theme(axis.line = element_line(color='gray'),
          plot.background = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank())+
    ggtitle("ROC of Mahalanobis Distance Classifier")+
    annotate("text", x = 0.75, y = 0.25, label = "AUC = 0.7624")

# Covariance of the tarp-only scores in PC space. It is not diagonal,
# since the rotation was fit on all training pixels, not on the tarp
# subset alone.
cov(tarpsOnly_pc_rgb)
##             PC1          PC2           PC3
## PC1  1.07466140 0.2606116466 -0.0376292406
## PC2  0.26061165 0.1200329881  0.0009582995
## PC3 -0.03762924 0.0009582995  0.0087330840
# Covariance of the tarp-only pixels in the original RGB space, shown
# for comparison with the PC-space covariance above.
cov(tarpsOnly_rgb)
##            red    green     blue
## red   1221.296 1480.892 1484.532
## green 1480.892 1917.161 2004.784
## blue  1484.532 2004.784 2437.707